home *** CD-ROM | disk | FTP | other *** search
/ Amiga Format CD 39 / Amiga Format CD39 (1999-04-13)(Future Publishing)(GB)[!][issue 1999-05].iso / -seriously_amiga- / graphics / ripley / source / idctppc.asm < prev    next >
Assembly Source File  |  1999-03-02  |  9KB  |  537 lines

  1. #********************************************************************
  2. #
  3. #    idctppc.asm
  4. #
  5. #    1.0 - 20.9.98 (copper)    written in assembler
  6. #
  7. #********************************************************************
  8.  
  9.             .globl    Initialize_Fast_IDCT
  10.             .globl    Fast_IDCT
  11.  
  12.             .set    W1,2841
  13.             .set    W2,2676
  14.             .set    W3,2408
  15.             .set    W5,1609
  16.             .set    W6,1108
  17.             .set    W7,565
  18.  
  19.             .bss    iclip,1024,4
  20.             .bss    iclp,1024,4
  21.  
  22.  
  23.             .set    blk,r12
  24.             .set    x0,r14
  25.             .set    x1,r15
  26.             .set    x2,r16
  27.             .set    x3,r17
  28.             .set    x4,r18
  29.             .set    x5,r19
  30.             .set    x6,r20
  31.             .set    x7,r21
  32.             .set    x8,r11
  33.  
  34. #********************************************************************
  35.             .text 
  36.  
  37. Initialize_Fast_IDCT:
  38.  
  39.             addi    r1,r1,-36
  40.             stswi    r13,r1,0            # save registers
  41.             stw        r21,32(r1)
  42.  
  43.             lis        r3,iclip@ha
  44.             addi    r3,r3,iclip@l
  45.  
  46.             li        r4,-256
  47.             li        r5,256
  48.             mtctr    r5                    
  49.             subi    r3,r3,4
  50. iniloop:
  51.             sthu    r4,2(r3)
  52.             bdnz    iniloop
  53.  
  54.             li        r4,-255
  55.             li        r5,512
  56.             mtctr    r5                    
  57.             subi    r3,r3,4
  58. iniloop1:
  59.             sthu    r4,2(r3)
  60.             addi    r4,r4,1
  61.             bdnz    iniloop1
  62.  
  63.             li        r4,-255
  64.             li        r5,256
  65.             mtctr    r5                    
  66.             subi    r3,r3,4
  67. iniloop2:
  68.             sthu    r4,2(r3)
  69.             bdnz    iniloop2
  70.  
  71.             lswi    r13,r1,0            # restore registers
  72.             lwz        r21,32(r1)
  73.             addi    r1,r1,36
  74.             blr                            #rts
  75.  
  76. #********************************************************************
  77.  
  78.             .macro    idctrow
  79.  
  80. #  /* shortcut */
  81. # if (!((x1 = blk[4]<<11) | (x2 = blk[6]) | (x3 = blk[2]) |
  82. #        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3])))
  83. #  {
  84. #    blk[0]=blk[1]=blk[2]=blk[3]=blk[4]=blk[5]=blk[6]=blk[7]=blk[0]<<3;
  85. #    return;
  86. #  }
  87.  
  88.             lhz        x1,4*2(blk)
  89.             extsh    x1,x1
  90.             rlwinm    x1,x1,11,0,20
  91.             mr        r31,x1
  92.             lhz        x2,6*2(blk)
  93.             extsh    x2,x2
  94.             or        r31,r31,x2
  95.             lhz        x3,2*2(blk)
  96.             extsh    x3,x3
  97.             or        r31,r31,x3
  98.             lhz        x4,1*2(blk)
  99.             extsh    x4,x4
  100.             or        r31,r31,x4
  101.             lhz        x5,7*2(blk)
  102.             extsh    x5,x5
  103.             or        r31,r31,x5
  104.             lhz        x6,5*2(blk)
  105.             extsh    x6,x6
  106.             or        r31,r31,x6
  107.             lhz        x7,3*2(blk)
  108.             extsh    x7,x7
  109.             or        r31,r31,x7
  110.             cmpwi    r31,0
  111.             bne        nozero
  112.  
  113. #    blk[0]=blk[1]=blk[2]=blk[3]=blk[4]=blk[5]=blk[6]=blk[7]=blk[0]<<3;
  114. #    return;
  115.  
  116.             lha        r31,0*2(blk)
  117.             extsh    r31,r31
  118.             rlwinm    r31,r31,3,0,28
  119.             sth        r31,0*2(blk)
  120.             sth        r31,1*2(blk)
  121.             sth        r31,2*2(blk)
  122.             sth        r31,3*2(blk)
  123.             sth        r31,4*2(blk)
  124.             sth        r31,5*2(blk)
  125.             sth        r31,6*2(blk)
  126.             sth        r31,7*2(blk)
  127.             
  128.             b        ok
  129.  
  130. nozero:
  131.  
  132. #  x0 = (blk[0]<<11) + 128; /* for proper rounding in the fourth stage */
  133.  
  134.             lhz        x0,0(blk)
  135.             extsh    x0,x0
  136.             rlwinm    x0,x0,11,0,20
  137.             addi    x0,x0,128
  138.  
  139. #  /* first stage */
  140. #  x8 = W7*(x4+x5);
  141. #  x4 = x8 + (W1-W7)*x4;
  142.  
  143.             add        x8,x4,x5
  144.             mulli    x8,x8,W7
  145.             mulli    x4,x4,(W1-W7)
  146.             add        x4,x8,x4
  147.  
  148. #  x5 = x8 - (W1+W7)*x5;
  149. #  x8 = W3*(x6+x7);
  150.  
  151.             mulli    x5,x5,(W1+W7)
  152.             sub        x5,x8,x5
  153.             add        x8,x6,x7
  154.             mulli    x8,x8,W3
  155.  
  156. #  x6 = x8 - (W3-W5)*x6;
  157. #  x7 = x8 - (W3+W5)*x7;
  158.   
  159.             mulli    x6,x6,(W3-W5)
  160.             sub        x6,x8,x6
  161.             mulli    x7,x7,(W3+W5)
  162.             sub        x7,x8,x7
  163.  
  164. #  /* second stage */
  165. #  x8 = x0 + x1;
  166. #  x0 -= x1;
  167.  
  168.             add        x8,x0,x1
  169.             sub        x0,x0,x1    
  170.  
  171. #  x1 = W6*(x3+x2);
  172. #  x2 = x1 - (W2+W6)*x2;
  173.  
  174.             add        x1,x3,x2
  175.             mulli    x1,x1,W6
  176.             mulli    x2,x2,(W2+W6)
  177.             sub        x2,x1,x2
  178.  
  179. #  x3 = x1 + (W2-W6)*x3;
  180. #  x1 = x4 + x6;
  181. #  x4 -= x6;
  182. #  x6 = x5 + x7;
  183. #  x5 -= x7;
  184.  
  185.             mulli    x3,x3,(W2-W6)
  186.             add        x3,x1,x3
  187.             add        x1,x4,x6
  188.             sub        x4,x4,x6
  189.             add        x6,x5,x7
  190.             sub        x5,x5,x7
  191.  
  192. #  /* third stage */
  193. #  x7 = x8 + x3;
  194. #  x8 -= x3;
  195. #  x3 = x0 + x2;
  196. #  x0 -= x2;
  197.  
  198.             add        x7,x8,x3
  199.             sub        x8,x8,x3
  200.             add        x3,x0,x2
  201.             sub        x0,x0,x2
  202.  
  203. #  x2 = (181*(x4+x5)+128)>>8;
  204. #  x4 = (181*(x4-x5)+128)>>8;
  205.  
  206.             add        x2,x4,x5
  207.             mulli    x2,x2,181
  208.             addi    x2,x2,128
  209.             srawi    x2,x2,8
  210.             sub        x4,x4,x5
  211.             mulli    x4,x4,181
  212.             addi    x4,x4,128
  213.             srawi    x4,x4,8
  214.   
  215. #  /* fourth stage */
  216. #  blk[0] = (x7+x1)>>8;
  217. #  blk[1] = (x3+x2)>>8;
  218. #  blk[2] = (x0+x4)>>8;
  219. #  blk[3] = (x8+x6)>>8;
  220. #  blk[4] = (x8-x6)>>8;
  221. #  blk[5] = (x0-x4)>>8;
  222. #  blk[6] = (x3-x2)>>8;
  223. #  blk[7] = (x7-x1)>>8;
  224.  
  225.             add        r31,x7,x1
  226.             srawi    r31,r31,8
  227.             extsh    r31,r31
  228.             sth        r31,0(blk)
  229.             add        r31,x3,x2
  230.             srawi    r31,r31,8
  231.             extsh    r31,r31
  232.             sth        r31,1*2(blk)
  233.             add        r31,x0,x4
  234.             srawi    r31,r31,8
  235.             extsh    r31,r31
  236.             sth        r31,2*2(blk)
  237.             add        r31,x8,x6
  238.             srawi    r31,r31,8
  239.             extsh    r31,r31
  240.             sth        r31,3*2(blk)
  241.             sub        r31,x8,x6
  242.             srawi    r31,r31,8
  243.             extsh    r31,r31
  244.             sth        r31,4*2(blk)
  245.             sub        r31,x0,x4
  246.             srawi    r31,r31,8
  247.             extsh    r31,r31
  248.             sth        r31,5*2(blk)
  249.             sub        r31,x3,x2
  250.             srawi    r31,r31,8
  251.             extsh    r31,r31
  252.             sth        r31,6*2(blk)
  253.             sub        r31,x7,x1
  254.             srawi    r31,r31,8
  255.             extsh    r31,r31
  256.             sth        r31,7*2(blk)
  257.  
  258. ok:
  259.             .endm
  260. #-----------------------------------------------------------------
  261.             .macro    idctcol
  262.  
  263.  
  264. #  /* shortcut */
  265. #  if (!((x1 = (blk[8*4]<<8)) | (x2 = blk[8*6]) | (x3 = blk[8*2]) |
  266. #        (x4 = blk[8*1]) | (x5 = blk[8*7]) | (x6 = blk[8*5]) | (x7 = blk[8*3])))
  267.  
  268.  
  269.             lhz        x1,8*4*2(blk)
  270.             extsh    x1,x1
  271.             rlwinm    x1,x1,8,0,23
  272.             mr        r31,x1
  273.             lhz        x2,8*6*2(blk)
  274.             extsh    x2,x2
  275.             or        r31,r31,x2
  276.             lhz        x3,8*2*2(blk)
  277.             extsh    x3,x3
  278.             or        r31,r31,x3
  279.             lhz        x4,8*1*2(blk)
  280.             extsh    x4,x4
  281.             or        r31,r31,x4
  282.             lhz        x5,8*7*2(blk)
  283.             extsh    x5,x5
  284.             or        r31,r31,x5
  285.             lhz        x6,8*5*2(blk)
  286.             extsh    x6,x6
  287.             or        r31,r31,x6
  288.             lhz        x7,8*3*2(blk)
  289.             extsh    x7,x7
  290.             or        r31,r31,x7
  291.             cmpwi    r31,0
  292.             bne        colnozero
  293.  
  294.  
  295. #    blk[8*0]=blk[8*1]=blk[8*2]=blk[8*3]=blk[8*4]=blk[8*5]=blk[8*6]=blk[8*7]=
  296. #      iclp[(blk[8*0]+32)>>6];
  297.  
  298.             lis        r30,iclp@ha
  299.             addi    r30,r30,iclp@l
  300.             
  301.             lhz        r31,8*0*2(blk)
  302.             extsh    r31,r31
  303.             addi    r31,r31,32
  304.             srawi    r31,r31,6
  305.             rlwinm    r31,r31,1,0,30            # * 2
  306.             lhzx    r31,r30,r31
  307.             extsh    r31,r31
  308.             sth        r31,8*0*2(blk)
  309.             sth        r31,8*1*2(blk)
  310.             sth        r31,8*2*2(blk)
  311.             sth        r31,8*3*2(blk)
  312.             sth        r31,8*4*2(blk)
  313.             sth        r31,8*5*2(blk)
  314.             sth        r31,8*6*2(blk)
  315.             sth        r31,8*7*2(blk)
  316.             
  317.             b        ok1
  318.  
  319. colnozero:
  320.  
  321. #  x0 = (blk[8*0]<<8) + 8192;
  322.  
  323.             lhz        x0,8*0(blk)
  324.             extsh    x0,x0
  325.             rlwinm    x0,x0,8,0,23
  326.             addi    x0,x0,8192
  327.             
  328. #  /* first stage */
  329. #  x8 = W7*(x4+x5) + 4;
  330. #  x4 = (x8+(W1-W7)*x4)>>3;
  331.  
  332. #  /* first stage */
  333. #  x8 = W7*(x4+x5);
  334. #  x4 = x8 + (W1-W7)*x4;
  335.  
  336.             add        x8,x4,x5
  337.             mulli    x8,x8,W7
  338.             addi    x8,x8,4
  339.             mulli    x4,x4,(W1-W7)
  340.             add        x4,x8,x4
  341.             srawi    x4,x4,3
  342.  
  343. #  x5 = (x8-(W1+W7)*x5)>>3;
  344. #  x8 = W3*(x6+x7) + 4;
  345.  
  346.             mulli    x5,x5,(W1+W7)
  347.             sub        x5,x8,x5
  348.             srawi    x5,x5,3
  349.             add        x8,x6,x7
  350.             mulli    x8,x8,W3
  351.             addi    x8,x8,4
  352.  
  353. #  x6 = (x8-(W3-W5)*x6)>>3;
  354. #  x7 = (x8-(W3+W5)*x7)>>3;
  355.  
  356.             mulli    x6,x6,(W3-W5)
  357.             sub        x6,x8,x6
  358.             srawi    x6,x6,3
  359.             mulli    x7,x7,(W3+W5)
  360.             sub        x7,x8,x7
  361.             srawi    x7,x7,3
  362.  
  363. #  /* second stage */
  364. #  x8 = x0 + x1;
  365. #  x0 -= x1;
  366.  
  367.             add        x8,x0,x1
  368.             sub        x0,x0,x1    
  369.  
  370. #  x1 = W6*(x3+x2) + 4;
  371. #  x2 = (x1-(W2+W6)*x2)>>3;
  372.  
  373.             add        x1,x3,x2
  374.             mulli    x1,x1,W6
  375.             addi    x1,x1,4
  376.             mulli    x2,x2,(W2+W6)
  377.             sub        x2,x1,x2
  378.             srawi    x2,x2,3
  379.  
  380. #  x3 = (x1+(W2-W6)*x3)>>3;
  381. #  x1 = x4 + x6;
  382. #  x4 -= x6;
  383. #  x6 = x5 + x7;
  384. #  x5 -= x7;
  385.  
  386.             mulli    x3,x3,(W2-W6)
  387.             add        x3,x1,x3
  388.             srawi    x3,x3,3
  389.             add        x1,x4,x6
  390.             sub        x4,x4,x6
  391.             add        x6,x5,x7
  392.             sub        x5,x5,x7
  393.  
  394. #  /* third stage */
  395. #  x7 = x8 + x3;
  396. #  x8 -= x3;
  397. #  x3 = x0 + x2;
  398. #  x0 -= x2;
  399.  
  400.             add        x7,x8,x3
  401.             sub        x8,x8,x3
  402.             add        x3,x0,x2
  403.             sub        x0,x0,x2
  404.  
  405. #  x2 = (181*(x4+x5)+128)>>8;
  406. #  x4 = (181*(x4-x5)+128)>>8;
  407.  
  408.             add        x2,x4,x5
  409.             mulli    x2,x2,181
  410.             addi    x2,x2,128
  411.             srawi    x2,x2,8
  412.             extsh    x2,x2
  413.             sub        x4,x4,x5
  414.             mulli    x4,x4,181
  415.             addi    x4,x4,128
  416.             srawi    x4,x4,8
  417.             extsh    x4,x4
  418.  
  419. # /* fourth stage */
  420. #  blk[8*0] = iclp[(x7+x1)>>14];
  421. #  blk[8*1] = iclp[(x3+x2)>>14];
  422. #  blk[8*2] = iclp[(x0+x4)>>14];
  423. #  blk[8*3] = iclp[(x8+x6)>>14];
  424. #  blk[8*4] = iclp[(x8-x6)>>14];
  425. #  blk[8*5] = iclp[(x0-x4)>>14];
  426. #  blk[8*6] = iclp[(x3-x2)>>14];
  427. #  blk[8*7] = iclp[(x7-x1)>>14];
  428.  
  429. # clipping erstmal wegelassen
  430.  
  431.             lis        r30,iclp@ha
  432.             addi    r30,r30,iclp@l
  433.  
  434.             add        r31,x7,x1
  435.             srawi    r31,r31,14
  436.             rlwinm    r31,r31,1,0,30
  437.             lhzx    r29,r30,r31
  438.             extsh    r29,r29
  439.             sth        r29,0(blk)
  440.             add        r31,x3,x2
  441.             srawi    r31,r31,14
  442.             rlwinm    r31,r31,1,0,30
  443.             lhzx    r29,r30,r31
  444.             extsh    r29,r29
  445.             sth        r29,8*1*2(blk)
  446.             add        r31,x0,x4
  447.             srawi    r31,r31,14
  448.             rlwinm    r31,r31,1,0,30
  449.             lhzx    r29,r30,r31
  450.             extsh    r29,r29
  451.             sth        r29,8*2*2(blk)
  452.             add        r31,x8,x6
  453.             srawi    r31,r31,14
  454.             rlwinm    r31,r31,1,0,30
  455.             lhzx    r29,r30,r31
  456.             extsh    r29,r29
  457.             sth        r29,8*3*2(blk)
  458.             sub        r31,x8,x6
  459.             srawi    r31,r31,14
  460.             rlwinm    r31,r31,1,0,30
  461.             lhzx    r29,r30,r31
  462.             extsh    r29,r29
  463.             sth        r29,8*4*2(blk)
  464.             sub        r31,x0,x4
  465.             srawi    r31,r31,14
  466.             rlwinm    r31,r31,1,0,30
  467.             lhzx    r29,r30,r31
  468.             extsh    r29,r29
  469.             sth        r29,8*5*2(blk)
  470.             sub        r31,x3,x2
  471.             srawi    r31,r31,14
  472.             rlwinm    r31,r31,1,0,30
  473.             lhzx    r29,r30,r31
  474.             extsh    r29,r29
  475.             sth        r29,8*6*2(blk)
  476.             sub        r31,x7,x1
  477.             srawi    r31,r31,14
  478.             rlwinm    r31,r31,1,0,30
  479.             lhzx    r29,r30,r31
  480.             extsh    r29,r29
  481.             sth        r29,8*7*2(blk)
  482.  
  483. ok1:
  484.             .endm
  485.  
  486. #-----------------------------------------------------------------------
  487. Fast_IDCT:
  488.  
  489. #  int i;
  490.             .set    i,r5
  491.             
  492.             .align    4
  493.             
  494.             addi    r1,r1,-36
  495.             stswi    r13,r1,0            # save registers
  496.             stw        r21,32(r1)
  497.  
  498. #  for (i=0; i<8; i++)
  499. #    idctrow(block+8*i);
  500.  
  501.             li        i,0
  502.  
  503. rowloop:
  504.             rlwinm    blk,i,4,0,27
  505.             add        blk,blk,r3
  506.             
  507.             idctrow
  508.             
  509.             addi    i,i,1
  510.             cmpwi    i,8
  511.             bne        rowloop
  512.  
  513. #  for (i=0; i<8; i++)
  514. #    idctcol(block+i);
  515.  
  516.             li        i,0
  517. colllop:
  518.             add        blk,i,i
  519.             add        blk,r3,blk
  520.             
  521.             idctcol
  522.             
  523.             addi    i,i,1
  524.             cmpwi    i,8
  525.             bne        colllop
  526.  
  527.             lswi    r13,r1,0            # restore registers
  528.             lwz        r21,32(r1)
  529.             addi    r1,r1,36
  530.             blr                            #rts
  531.  
  532.  
  533.             .type    Initialize_Fast_IDCT,@function
  534.             .size    Initialize_Fast_IDCT,$-Initialize_Fast_IDCT
  535.             .type    Fast_IDCT,@function
  536.             .size    Fast_IDCT,$-Fast_IDCT
  537.